import requests
import re
import time
import json
import multiprocessing


class HeadleLaGou(object):
    """Crawl Python job postings from lagou.com, one city at a time.

    Typical use: call ``handle_city()`` to populate ``city_list``, then call
    ``handle_city_job(city)`` for each city of interest.
    """

    # Captures the link text of every city anchor on the "all cities" page.
    CITY_PATTERN = re.compile(r'www\.lagou\.com\/.*\/">(.*?)</a>')
    # Captures the total number of result pages on a job listing page.
    TOTAL_PAGE_PATTERN = re.compile(r'class="span\stotalNum">(\d+)</span>')

    CITY_INDEX_URL = "https://www.lagou.com/jobs/allCity.html"
    # Listing page for one city; also used as Referer and to obtain cookies.
    LIST_URL = "https://www.lagou.com/jobs/list_python?city=%s&cl=false&fromSearch=true&labelWords=&suginput="
    # JSON endpoint that returns one page of postings for one city.
    AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?city=%s&needAddtionalResult=false"

    # Substring of the response body that signals the request was throttled.
    THROTTLE_MARKER = '频繁'
    # Per-request timeout and pause before retrying a throttled request,
    # both in seconds.
    REQUEST_TIMEOUT = 6
    THROTTLE_BACKOFF = 10

    def __init__(self):
        self.lagou_session = requests.session()
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/53",
        }
        # City names found by handle_city(); empty until it has been called.
        self.city_list = []
        # Misspelled legacy attribute, kept only so existing readers of it
        # do not break. Use city_list instead.
        self.city_lisrt = ""

    def handle_city(self):
        """Fetch the "all cities" page and store the city names in ``city_list``."""
        city_page = self.handle_request(method="GET", url=self.CITY_INDEX_URL)
        self.city_list = self.CITY_PATTERN.findall(city_page)
        # Drop the cookies picked up by this request.
        self.lagou_session.cookies.clear()

    def handle_city_job(self, city):
        """Fetch and print every page of Python job postings for ``city``.

        Args:
            city: City name as it appears in ``city_list``.

        Returns:
            None. Returns early when the listing page carries no page count,
            which is treated as "no postings for this city".

        Raises:
            ValueError: If a page response is not valid JSON
                (``json.JSONDecodeError`` is a subclass).
            KeyError: If the JSON lacks ``content.positionResult.result``.
        """
        list_url = self.LIST_URL % city
        first_response = self.handle_request(method="GET", url=list_url)

        page_count_match = self.TOTAL_PAGE_PATTERN.search(first_response)
        if page_count_match is None:
            # No job information for this city.
            return
        total_page = int(page_count_match.group(1))

        page_url = self.AJAX_URL % city
        # The Referer contains the (possibly non-ASCII) city name, so it is
        # sent as encoded bytes rather than as a str header value.
        self.header['Referer'] = list_url.encode()

        for page_number in range(1, total_page + 1):
            data = {
                "pn": page_number,
                "kd": "python",
            }
            response = self.handle_request(
                method="POST", url=page_url, data=data, info=city)
            print(response)
            lagou_data = json.loads(response)
            job_list = lagou_data['content']['positionResult']['result']
            for job in job_list:
                print(job)
                # lagou_mysql.insert_item(job)

    def handle_request(self, method, url, data=None, info=None):
        """Send a request through the shared session, retrying while throttled.

        Args:
            method: ``'GET'`` or ``'POST'``.
            url: Target URL.
            data: Form payload for POST requests.
            info: City name. When given, the listing page for that city is
                requested after a throttled response to obtain fresh cookies
                before retrying.

        Returns:
            The response body text of the first non-throttled response.

        Raises:
            ValueError: If ``method`` is neither ``'GET'`` nor ``'POST'``.
        """
        if method not in ('GET', 'POST'):
            raise ValueError("unsupported HTTP method: %r" % (method,))

        # NOTE: a proxy can be supplied via the ``proxies=`` argument of the
        # session calls below if requests need to be routed through one.
        while True:
            if method == 'GET':
                response = self.lagou_session.get(
                    url=url, headers=self.header,
                    timeout=self.REQUEST_TIMEOUT)
            else:
                response = self.lagou_session.post(
                    url=url, headers=self.header, data=data,
                    timeout=self.REQUEST_TIMEOUT)

            text = response.text
            if self.THROTTLE_MARKER not in text:
                # Cookies are deliberately kept here: the listing-page GET
                # sets the cookies that the following POSTs rely on.
                return text

            print('频繁')
            print(text)
            # Throttled: discard the current cookies first ...
            self.lagou_session.cookies.clear()
            # ... then, when the city is known, request its listing page
            # again so the session receives a fresh set of cookies.
            if info is not None:
                self.lagou_session.get(
                    url=self.LIST_URL % info, headers=self.header,
                    timeout=self.REQUEST_TIMEOUT)
            time.sleep(self.THROTTLE_BACKOFF)


if __name__ == '__main__':
    lagou = HeadleLaGou()
    # Fetch the list of all cities first.
    lagou.handle_city()
    print(lagou.city_list)
    # Crawl the cities in parallel using a pool of two worker processes.
    pool = multiprocessing.Pool(2)
    # Keep each AsyncResult: an exception raised inside a worker is only
    # re-raised in the parent when .get() is called on its result.
    pending = [
        (city, pool.apply_async(lagou.handle_city_job, args=(city,)))
        for city in lagou.city_list
    ]
    pool.close()
    for city, result in pending:
        try:
            result.get()
        except Exception as exc:
            # Report the failure and carry on with the remaining cities.
            print("crawl failed for %s: %r" % (city, exc))
    pool.join()
